############################################################
## R script to run some actual regressions and produce all output
## we are going to include in the paper version for Gov 2001
##
## Chris path '/Users/creidl/Dropbox/Gov 2001 project/'
## Frank path 'C:/Users/fnagle/Documents/My Dropbox/Gov 2001 project/'
##
## Change Log
##     	2012-04-06 initial version based on Regressions.R (Chris)
##		---------- multiple changes and edits in between (Chris)
##      2012-04-19 added computation of quantity of interest: how much more likely to review in 2007 than in 2009?
############################################################

# Load required libraries
library(ggplot2)
library(lme4)
library(Zelig)
library(lmtest)
library(xtable)
library(apsrtable)		# For combining/printing tables of multiple OLS models
library(sandwich)
library(memisc)			# For combining/printing tables of multiple lmer models 


# Some helper code has been moved to other source. Load them.
source("/Users/creidl/Dropbox/T1D/04-dataAnalysis-ratings/corstarsl.R")
source("/Users/creidl/Dropbox/Gov 2001 project/R_dataAnalysis/HelperFunctions.R")

# Settings for graphs
cols=c("#00A600FF", "#FF4000FF")
lw=2

# Read Data
setwd('/Users/creidl/Dropbox/Gov 2001 project/R_dataCollection/')
m <- rbind( cbind( read.csv(file="data/real-final/master-data-WEEK-2007.csv", head=TRUE, sep=","), data.frame(YEAR=2007) ),
			cbind( read.csv(file="data/real-final/master-data-WEEK-2008.csv", head=TRUE, sep=","), data.frame(YEAR=2008) ),
			cbind( read.csv(file="data/real-final/master-data-WEEK-2009.csv", head=TRUE, sep=","), data.frame(YEAR=2009) ) )

# Convert to factor
m$movieID 	<- as.factor(m$movieID)
m$MPAA 		<- as.factor(m$MPAA)
m$MPAA 		<- relevel(m$MPAA, ref="R")
m$WK 		<- as.factor(m$WK)
m$HOLIDAY 	<- as.factor(m$HOLIDAY)
m$YEAR 		<- as.factor(m$YEAR)

## Merge in Oscar Nominations and Wins
oscars <- getOscars('/Users/creidl/Dropbox/Gov 2001 project/python_dataCollection/')
m <- merge(m, oscars, all.x=TRUE)
m$oscarNominations[is.na(m$oscarNominations)] <- 0
m$oscarWins[is.na(m$oscarWins)] <- 0

## Read RottenTomatoes critics rating
critics <- getCritics('/Users/creidl/Dropbox/Gov 2001 project/python_dataCollection/')
m <- merge(m, critics)

# Set wd to latex folder so that all the output files we generate here are in the right location
setwd('/Users/creidl/Dropbox/Gov 2001 project/our-paper/')


##################################################
##################################################
##################################################
##################################################
##################################################
## Summary Statistics: Table 1
##
## Different than original paper but nicer
##
##################################################
##################################################
##################################################
##################################################
##################################################
print( paste("Total number of movies =", length(unique(as.character(use$movieID)))))
print( paste("Total number of user ratings =", sum(use$VOL)))

# Take only first week for each movie
u <- m[!duplicated(m$movieID),]

## Frequency Table of MPAA Rating
mpaa <- table(u$MPAA)
xtable(mpaa, include.rownames=FALSE)

## Frequency Table of Genre
dat <- c(
	nrow(u[u$genreSciFi>0,]),
	nrow(u[u$genreFamily>0,]),
	nrow(u[u$genreDrama>0,]),
	nrow(u[u$genreComedy>0,]),
	nrow(u[u$genreRomance>0,]),
	nrow(u[u$genreAction>0,]),
	nrow(u[u$genreThriller>0,]),
	nrow(u[u$sequel>0,])
)
datMat <- matrix(dat, nrow=8, ncol=1, byrow=TRUE, dimnames=list(c("Science fiction", "Family / Children", "Drama", "Comedy", "Romance", "Action", "Thriller", "Other Property: Sequel"), c("Frequency")))
xtable(datMat)


## Econ Journal style descriptives table with cross-correlation: this is not what Dellarocas has but it looks much nicer
variables <- m[,c("BOX", "VOL", "THEATERS", "RPT", "TSOFAR", "DISCUM", "AVG", "CRIT", "HOLIDAY", "COMPETITION", "oscarNominations", "oscarWins")]
variables$HOLIDAY <- as.numeric( as.character(variables$HOLIDAY))
text <- xtable(corstarsl(variables), label="descriptivesEconStyle")
text
print.xtable(text, file="descriptivesEconStyleFull.tex" )





#######################################
#######################################
#######################################
#######################################
## Table 1: All models
##
#######################################
#######################################
#######################################
#######################################
depVar <- "$log(VOL_{jt}/BOX_{jt})$"
#depVar <- "$log(VOL_{jt}/BOX_{jt})$"
myCaption <- paste("Model 1: Original results published by \\cite{DellarocasGaoNarayan:2010}; Model 2: Same model using our extended dataset for 2007-2009; Model 3: Alternative control variables for movie quality; Model 4: Test for disagreement hypothesis (H4). Model 5: Interaction effects added (H5 and H6). All models OLS.", sep="")
myCustomMsg <- ""

models <- list( 
lm( DEP ~ THEATERS + RPT + TSOFAR + AVG + I(AVG^2) + MPAA + genreSciFi + genreDrama + genreComedy + genreRomance + genreAction + genreThriller + sequel + COMPETITION + WK + YEAR + HOLIDAY, data=m),
lm( DEP ~ THEATERS + RPT + TSOFAR + AVG + I(AVG^2) + MPAA + genreSciFi + genreDrama + genreComedy + genreRomance + genreAction + genreThriller + sequel + COMPETITION + WK + YEAR + HOLIDAY, data=m),
lm( DEP ~ THEATERS + RPT + TSOFAR + CRIT + oscarNominations + oscarWins + MPAA + genreSciFi + genreDrama + genreComedy + genreRomance + genreAction + genreThriller + sequel + COMPETITION + WK + YEAR + HOLIDAY, data=m),
lm( DEP ~ THEATERS + RPT + TSOFAR + DISCUM + CRIT + oscarNominations + oscarWins + MPAA + genreSciFi + genreDrama + genreComedy + genreRomance + genreAction + genreThriller + sequel + COMPETITION + WK + YEAR + HOLIDAY, data=m),
lm( DEP ~ THEATERS + RPT + TSOFAR*THEATERS + TSOFAR*RPT + DISCUM*THEATERS + DISCUM*RPT + CRIT + oscarNominations + oscarWins + MPAA + genreSciFi + genreDrama + genreComedy + genreRomance + genreAction + genreThriller + sequel + COMPETITION + WK + YEAR + HOLIDAY, data=m)
)

for(i in 1:length(models)) {
	#models[[i]]$se <- vcovHC(models[[i]])
	models[[i]]$se <- getVcovCL(m, models[[i]], m$movieID)
}

# model 1 should be original data from Dellarocas: copy his data into this structure
#models[[1]]$coefficients[1] <- 1.876
#models[[1]]$se[1] <- 0.0636

t <- apsrtable(models[[1]], models[[2]], models[[3]], models[[4]], models[[5]], stars="default", omitcoef=expression(grep("MPAA|genre|seque|COMPETITION|WK|YEAR|HOLIDAY", coefnames)), digits=3, se="robust", float="table", notes=list(se.note.crse, stars.note, note.gen), caption.position="below", caption=myCaption, order="lr", model.counter=1, label="mainTable1"); writeLines( t, paste("mainTable1.tex", sep=""))

t <- apsrtable(models[[1]], models[[2]], models[[3]], models[[4]], models[[5]], stars="default", omitcoef=expression(grep("gedfdfnre|SubmissionNumber|Country", coefnames)), digits=3, se="robust", float="table", notes=list(se.note.crse, stars.note, note.gen), caption.position="below", caption=myCaption, order="lr", model.counter=1, label="mainTable1Appendix"); writeLines( t, paste("mainTable1Appendix.tex", sep=""))




#######################################
#######################################
#######################################
#######################################
### Table 2: OUR Model with all controls, but by week.
#######################################
#######################################
#######################################
#######################################
depVar <- "$log(VOL_{jt}/BOX_{jt})$"
myCaption <- paste("DV: ", depVar, ". OLS Regression by week. Model 1: WK2; Model 2: WK3; Model 3: WK4; Model 4: WK5.", sep="")
myCustomMsg <- ""

formula <- as.formula( DEP ~ THEATERS + RPT + TSOFAR + DISCUM + CRIT + oscarNominations + oscarWins + MPAA + genreSciFi + genreDrama + genreComedy + genreRomance + genreAction + genreThriller + sequel + COMPETITION + HOLIDAY + YEAR)

models <- list( 
lm( formula, data=m[m$WK==2,]),
lm( formula, data=m[m$WK==3,]),
lm( formula, data=m[m$WK==4,]),
lm( formula, data=m[m$WK==5,])
)

models[[1]]$se <- getVcovCL(m[m$WK==2,], models[[1]], m$movieID[m$WK==2])
models[[2]]$se <- getVcovCL(m[m$WK==3,], models[[2]], m$movieID[m$WK==3])
models[[3]]$se <- getVcovCL(m[m$WK==4,], models[[3]], m$movieID[m$WK==4])
models[[4]]$se <- getVcovCL(m[m$WK==5,], models[[4]], m$movieID[m$WK==5])

t <- apsrtable(models[[1]], models[[2]], models[[3]], models[[4]], stars="default", omitcoef=expression(grep("WK|genre|MPAA|HOLIDAY|sequel|COMPETITION|AVG|YEAR|oscar|CRIT", coefnames)), digits=3, se="robust", float="table", notes=list(se.note.crse, stars.note, note.gen), caption.position="below", caption=myCaption, order="longest", model.counter=1, label="mainTableByWeek"); writeLines( t, paste("mainTableByWeek.tex", sep=""))





#######################################
### Figure 2: Hump-shaped decay of movie reviews
#######################################
## See extra file: CompleteAnalysisPaperGaryPlotsDayData


#######################################
### Figure 3: Average deviation of movie i’s mean rating from the mean rating of all ratings.
#######################################
## See extra file: CompleteAnalysisPaperGaryPlotsDayData


#######################################
### REPLICATE Figure 3: Density Plot of VOL / BOX
#######################################
x <- m[m$DEP>log(0.0000002),]
pdf("figures/Rplot-Density2Up.pdf", width=7, height=4)
op <- par(mfrow=c(1,2))
d <- density( (x$VOL / x$BOX) ) 
plot(d, main="", xlab="VOL/BOX", ylab="Density") 
d <- density(x$DEP) 
plot(d, main="", xlab="log(VOL/BOX)", ylab="Density") 
par(op)
dev.off()


#######################################
### REPLICATE Figure 4: Population's Average Propensity to Review a Movie
## Note: Where does this linear pattern come from?
## The linear pattern are movies that have the same VOL, namely 1, 2, 3, and 4. Since our dataset is so big now we have many movies that have VOL=1 but varying BOX (N=42), that is the lowest of the linear patterns. Then we have many movies that have VOL=2 but varyuing BOX (N=41), and so on. Hence, it is all natural and nothing to worry about. It just looks funny. The further VOL grows, the less movies we have with the same VOL so the pattern disappears.
#######################################
x <- m[m$VOL>0,]
pdf("figures/Rplot-PropensityToReview-real.pdf", width=5, height=5)
plot(log(x$BOX), x$DEP, xlab="log(BOX)", ylab="log(Propensity to Review)")
model2 <- lm( DEP ~ log(BOX) + I(log(BOX)^2), x )
xfit <- seq(-5, 5, 5/50)
yfit <- model2$coef[1:3] %*% rbind (1, xfit, I(xfit^2))
lines(xfit, yfit, lwd=lw, col=cols[2])
dev.off()



#######################################
### Figure X: Interaction Plot: THEATERS:DISCUM
#######################################
m$AVAIL[m$THEATERS<quantile(m$THEATERS, c(.5))] <- "limited"
m$AVAIL[m$THEATERS>=quantile(m$THEATERS, c(.5))] <- "wide"

m$TSOFARLEV[m$TSOFAR<quantile(m$TSOFAR, c(.5))] <- "low"
m$TSOFARLEV[m$TSOFAR>=quantile(m$TSOFAR, c(.5))] <- "high"
m$TSOFARLEV <- relevel(as.factor(m$TSOFARLEV), "low")

m$DISAG[m$DISCUM<quantile(m$DISCUM, c(.5))] <- "low"
m$DISAG[m$DISCUM>=quantile(m$DISCUM, c(.5))] <- "high"
m$DISAG <- relevel(as.factor(m$DISAG), "low")

pdf("figures/Rplot-Interaction2Up.pdf", width=10, height=5)
op <- par(mfrow=c(1,2))
interaction.plot(m$TSOFARLEV, m$AVAIL, m$DEP, type="b", col=cols, lty=c(1, 1), lwd=lw, pch=c(1, 2), main="Interacting Availability and Prior Reviews", xlab="Volume of Previously Posted Reviews (TSOFAR)", ylab="Propensity to Review", legend=FALSE)
legend("topleft", c("limited", "wide"), pch=c(1, 2), col=cols, title="Legend", inset=.05, lwd=lw)

interaction.plot(m$DISAG, m$AVAIL, m$DEP, type="b", col=cols, lty=c(1, 1), lwd=lw, pch=c(1, 2), main="Interacting Availability and Prior Disagreement", xlab="Level of Previous Disagreement (DISSOFAR)", ylab="Propensity to Review", legend=FALSE)
legend("topleft", c("limited", "wide"), pch=c(1, 2), col=cols, title="Legend", inset=.05, lwd=lw)

par(op)
dev.off()



#######################################
### Quantity of Interst: How much more likely are users to review in 2007 than in 2009?
#######################################
fit1 <- lm(DEP ~ 1, m[m$YEAR==2007,])	# Model with only the intercept
fit2 <- lm(DEP ~ 1, m[m$YEAR==2009,])
x <- fit1$coefficients[1]
y <- fit2$coefficients[1]
exp(1)^x/exp(1)^y

# Cummulative AVG and DISAGREEMENT
m$WK2 <- as.numeric(m$WK)
mean(m$AVGOrig[m$WK2<2])
mean(m$AVGOrig[m$WK2<6])
mean(m$DIS[m$WK2<2])
mean(m$DIS[m$WK2<6])


## Not used but could be useful: Calculate Effect Size Cohen's D
## http://www.r-bloggers.com/short-r-script-to-plot-effect-sizes-cohen%E2%80%99s-d-and-shade-overlapping-area/
## http://en.wikipedia.org/wiki/Effect_size#Cohen.27s_.C6.922
m1 <- mean(m$DEP[m$TSOFAR<quantile(m$TSOFAR, c(.5))])
m2 <- mean(m$DEP[m$TSOFAR>quantile(m$TSOFAR, c(.5))])
cohenD <- ( m2 - m1 ) / sd(m$DEP)
cohenD



#######################################
### Table: Appendix - Random Effects Model
#######################################
fit.lmer2 <- lmer( DEP ~ THEATERS + RPT + TSOFAR + AVG + I(AVG^2) + MPAA + genreSciFi + genreDrama + genreComedy + genreRomance + genreAction + genreThriller + sequel + COMPETITION + WK + YEAR + HOLIDAY + (1| movieID), m)
fit.lmer3 <- lmer( DEP ~ THEATERS + RPT + TSOFAR + CRIT + oscarNominations + oscarWins + MPAA + genreSciFi + genreDrama + genreComedy + genreRomance + genreAction + genreThriller + sequel + COMPETITION + WK + YEAR + HOLIDAY + (1| movieID), m)
fit.lmer4 <- lmer( DEP ~ THEATERS + RPT + TSOFAR + DISCUM + CRIT + oscarNominations + oscarWins + MPAA + genreSciFi + genreDrama + genreComedy + genreRomance + genreAction + genreThriller + sequel + COMPETITION + WK + YEAR + HOLIDAY + (1| movieID), m)
fit.lmer5 <- lmer( DEP ~ THEATERS + RPT + TSOFAR*THEATERS + TSOFAR*RPT + DISCUM*THEATERS + DISCUM*RPT + CRIT + oscarNominations + oscarWins + MPAA + genreSciFi + genreDrama + genreComedy + genreRomance + genreAction + genreThriller + sequel + COMPETITION + WK + YEAR + HOLIDAY + (1| movieID), m)

model=mtable(fit.lmer2, fit.lmer3, fit.lmer4, fit.lmer5, summary.stats=FALSE)
toLatex(model)		# print model to R console
# The latex code generated by this does not compile automatically but needs to be edited. the AVG^2 needs to be escaped with $AVG^2$
# So, because the code needs to be manually edited, we do not automatically overwrite the file
writeLines( toLatex(model), paste("mainTableRandomEffects.tex", sep=""))



